library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
DARWIN <- read.csv("~/GitHub/FCA/Data/DARWIN/DARWIN.csv")
rownames(DARWIN) <- DARWIN$ID
DARWIN$ID <- NULL
DARWIN$class <- 1*(DARWIN$class=="P")
print(table(DARWIN$class))
#>
#> 0 1
#> 85 89
DARWIN[,1:ncol(DARWIN)] <- sapply(DARWIN,as.numeric)
signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(DARWIN) %in% c("class"));
DARWIN[,whof] <- signedlog(DARWIN[,whof])
studyName <- "DARWIN"
dataframe <- DARWIN
outcome <- "class"
TopVariables <- 10
thro <- 0.80
cexheat = 0.15
Some libraries
library(psych)
library(whitening)
library("vioplot")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 174 | 450 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 85 | 89 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1000
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData
if (!largeSet)
{
hm <- heatMaps(data=dataframe,
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9992136
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> Included: 450 , Uni p: 0.006350853 , Uncorrelated Base: 268 , Outcome-Driven Size: 0 , Base Size: 268
#>
#>
1 <R=0.999,w= 1,N= 82>, Top: 41( 1 )[ 1 : 41 : 0.975 ]( 41 , 41 , 0 ),<|>Tot Used: 82 , Added: 41 , Zero Std: 0 , Max Cor: 0.974
#>
2 <R=0.974,w= 1,N= 82>, Top: 18( 1 )[ 1 : 18 : 0.962 ]( 18 , 19 , 41 ),<|>Tot Used: 118 , Added: 19 , Zero Std: 0 , Max Cor: 0.960
#>
3 <R=0.960,w= 1,N= 82>, Top: 8( 1 )[ 1 : 8 : 0.955 ]( 8 , 8 , 58 ),<|>Tot Used: 134 , Added: 8 , Zero Std: 0 , Max Cor: 0.955
#>
4 <R=0.955,w= 2,N= 43>, Top: 21( 1 )[ 1 : 21 : 0.927 ]( 21 , 22 , 66 ),<|>Tot Used: 174 , Added: 22 , Zero Std: 0 , Max Cor: 0.927
#>
5 <R=0.927,w= 2,N= 43>, Top: 9( 1 )[ 1 : 9 : 0.914 ]( 9 , 10 , 84 ),<|>Tot Used: 188 , Added: 10 , Zero Std: 0 , Max Cor: 0.912
#>
6 <R=0.912,w= 2,N= 43>, Top: 2( 1 )[ 1 : 2 : 0.906 ]( 2 , 2 , 89 ),<|>Tot Used: 191 , Added: 2 , Zero Std: 0 , Max Cor: 0.906
#>
7 <R=0.906,w= 3,N= 107>, Top: 49( 2 )[ 1 : 49 : 0.853 ]( 48 , 51 , 90 ),<|>Tot Used: 277 , Added: 51 , Zero Std: 0 , Max Cor: 0.913
#>
8 <R=0.913,w= 3,N= 107>, Top: 6( 1 )[ 1 : 6 : 0.856 ]( 6 , 6 , 130 ),<|>Tot Used: 281 , Added: 6 , Zero Std: 0 , Max Cor: 0.853
#>
9 <R=0.853,w= 4,N= 57>, Top: 29( 1 )[ 1 : 29 : 0.800 ]( 28 , 28 , 132 ),<|>Tot Used: 308 , Added: 28 , Zero Std: 0 , Max Cor: 0.929
#>
10 <R=0.929,w= 4,N= 57>, Top: 4( 1 )[ 1 : 4 : 0.814 ]( 4 , 4 , 143 ),<|>Tot Used: 308 , Added: 4 , Zero Std: 0 , Max Cor: 0.808
#>
11 <R=0.808,w= 5,N= 4>, Top: 2( 1 )[ 1 : 2 : 0.800 ]( 2 , 2 , 144 ),<|>Tot Used: 308 , Added: 2 , Zero Std: 0 , Max Cor: 0.797
#>
12 <R=0.000,w= 6,N= 0>
#>
[ 12 ], 0.7965355 Decor Dimension: 308 . Cor to Base: 163 , ABase: 131 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
489
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
333
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
4.9
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
4.58
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPSTM <- attr(DEdataframe,"UPSTM")
gplots::heatmap.2(1.0*(abs(UPSTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after IDeA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.7965355
classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])
datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
100 : mean_jerk_in_air6 200 : disp_index12 300 : mean_speed_in_air17 400 : gmrt_on_paper23
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| total_time23 | 0.767 | 0.909 | -0.366 | 0.736 | 6.93e-05 | 0.863 |
| total_time15 | 0.775 | 1.062 | -0.442 | 0.572 | 4.78e-01 | 0.844 |
| air_time23 | 0.599 | 0.766 | -0.374 | 0.715 | 2.31e-02 | 0.844 |
| air_time15 | 0.684 | 1.112 | -0.506 | 0.669 | 7.09e-01 | 0.829 |
| total_time17 | 0.806 | 1.082 | -0.400 | 0.966 | 3.10e-02 | 0.824 |
| paper_time23 | 0.690 | 1.106 | -0.435 | 0.703 | 6.55e-01 | 0.814 |
| air_time17 | 0.674 | 0.980 | -0.378 | 0.863 | 8.86e-02 | 0.806 |
| paper_time17 | 0.664 | 1.045 | -0.413 | 0.929 | 1.79e-01 | 0.796 |
| total_time6 | 0.680 | 1.069 | -0.364 | 0.665 | 7.13e-01 | 0.790 |
| air_time16 | 0.426 | 0.841 | -0.414 | 0.650 | 8.51e-01 | 0.787 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]
pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| air_time23 | 0.5993 | 0.766 | -0.37359 | 0.715 | 2.31e-02 | 0.844 |
| air_time15 | 0.6835 | 1.112 | -0.50588 | 0.669 | 7.09e-01 | 0.829 |
| air_time17 | 0.6742 | 0.980 | -0.37796 | 0.863 | 8.86e-02 | 0.806 |
| air_time16 | 0.4258 | 0.841 | -0.41386 | 0.650 | 8.51e-01 | 0.787 |
| disp_index23 | 0.5808 | 0.924 | -0.35306 | 0.816 | 3.70e-01 | 0.787 |
| air_time6 | 0.5641 | 0.982 | -0.41988 | 0.746 | 6.19e-01 | 0.784 |
| air_time7 | 0.5315 | 0.829 | -0.23828 | 0.882 | 7.92e-03 | 0.779 |
| gmrt_in_air7 | -0.4478 | 0.811 | 0.42274 | 0.794 | 9.97e-01 | 0.775 |
| paper_time9 | 0.4679 | 0.890 | -0.41181 | 0.710 | 7.56e-01 | 0.774 |
| air_time2 | 0.3619 | 0.810 | -0.44747 | 0.699 | 1.52e-01 | 0.773 |
| La_total_time5 | 0.2557 | 0.509 | 0.00354 | 0.164 | 1.13e-07 | 0.730 |
| La_mean_speed_on_paper13 | -0.0359 | 0.115 | 0.02486 | 0.195 | 1.90e-05 | 0.728 |
| La_mean_speed_on_paper2 | -0.1135 | 0.389 | 0.08430 | 0.290 | 1.43e-08 | 0.716 |
| La_disp_index21 | -0.4089 | 0.624 | -0.01813 | 0.304 | 8.89e-01 | 0.706 |
| La_paper_time3 | 0.2542 | 0.406 | 0.00794 | 0.231 | 5.43e-01 | 0.693 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 2.11 | 172 | 0.382 |
allSigvars <- names(dc)
dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
coef <- theFormulas[[dx]]
cname <- names(theFormulas[[dx]])
names(cname) <- cname
for (cf in names(coef))
{
if (cf != dx)
{
if (coef[cf]>0)
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
}
else
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("%5.3f*%s",coef[cf],cname[cf]))
}
}
}
}
finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| air_time23 | 0.5993 | 0.766 | -0.37359 | 0.715 | 2.31e-02 | 0.844 | 0.844 | 1 | |
| air_time15 | 0.6835 | 1.112 | -0.50588 | 0.669 | 7.09e-01 | 0.829 | 0.829 | 1 | |
| air_time17 | 0.6742 | 0.980 | -0.37796 | 0.863 | 8.86e-02 | 0.806 | 0.806 | 1 | |
| air_time16 | 0.4258 | 0.841 | -0.41386 | 0.650 | 8.51e-01 | 0.787 | 0.787 | 1 | |
| disp_index23 | 0.5808 | 0.924 | -0.35306 | 0.816 | 3.70e-01 | 0.787 | 0.787 | 1 | |
| air_time6 | 0.5641 | 0.982 | -0.41988 | 0.746 | 6.19e-01 | 0.784 | 0.784 | 1 | |
| air_time7 | 0.5315 | 0.829 | -0.23828 | 0.882 | 7.92e-03 | 0.779 | 0.779 | 1 | |
| gmrt_in_air7 | -0.4478 | 0.811 | 0.42274 | 0.794 | 9.97e-01 | 0.775 | 0.775 | 1 | |
| paper_time9 | 0.4679 | 0.890 | -0.41181 | 0.710 | 7.56e-01 | 0.774 | 0.774 | 2 | |
| air_time2 | 0.3619 | 0.810 | -0.44747 | 0.699 | 1.52e-01 | 0.773 | 0.773 | 1 | |
| La_total_time5 | -0.813paper_time5 + 1.000total_time5 | 0.2557 | 0.509 | 0.00354 | 0.164 | 1.13e-07 | 0.730 | 0.674 | -1 |
| La_mean_speed_on_paper13 | -0.971gmrt_on_paper13 + 1.000mean_speed_on_paper13 | -0.0359 | 0.115 | 0.02486 | 0.195 | 1.90e-05 | 0.728 | 0.626 | -1 |
| mean_speed_on_paper2 | NA | -0.3422 | 0.901 | 0.35546 | 0.928 | 4.91e-01 | 0.720 | 0.720 | NA |
| La_mean_speed_on_paper2 | -0.878gmrt_on_paper2 + 1.000mean_speed_on_paper2 | -0.1135 | 0.389 | 0.08430 | 0.290 | 1.43e-08 | 0.716 | 0.720 | -1 |
| paper_time3 | NA | 0.3381 | 1.043 | -0.40899 | 0.989 | 6.14e-01 | 0.715 | 0.715 | NA |
| La_disp_index21 | + 1.000disp_index21 -0.906paper_time21 | -0.4089 | 0.624 | -0.01813 | 0.304 | 8.89e-01 | 0.706 | 0.538 | -1 |
| La_paper_time3 | -0.946disp_index3 + 1.000paper_time3 | 0.2542 | 0.406 | 0.00794 | 0.231 | 5.43e-01 | 0.693 | 0.715 | -1 |
| total_time5 | NA | 0.3576 | 1.238 | -0.21141 | 0.766 | 6.87e-01 | 0.674 | 0.674 | NA |
| disp_index3 | NA | 0.0887 | 1.012 | -0.44082 | 1.088 | 4.68e-01 | 0.669 | 0.669 | 1 |
| gmrt_on_paper2 | NA | -0.2603 | 0.961 | 0.30870 | 1.034 | 9.26e-01 | 0.663 | 0.663 | 2 |
| paper_time5 | NA | 0.1254 | 1.429 | -0.26454 | 0.926 | 8.36e-01 | 0.629 | 0.629 | 1 |
| mean_speed_on_paper13 | NA | -0.3097 | 0.976 | 0.13242 | 0.759 | 9.69e-01 | 0.626 | 0.626 | NA |
| gmrt_on_paper13 | NA | -0.2820 | 1.004 | 0.11081 | 0.765 | 9.59e-01 | 0.606 | 0.606 | 2 |
| paper_time21 | NA | 0.1398 | 1.174 | -0.08999 | 1.084 | 8.16e-01 | 0.542 | 0.542 | NA |
| disp_index21 | NA | -0.2822 | 1.300 | -0.09963 | 0.981 | 7.01e-02 | 0.538 | 0.538 | NA |